# --- Load Necessary Libraries ---
cat("\n---- Loading Necessary Libraries ----\n")
##
## ---- Loading Necessary Libraries ----
# Install and load libraries individually
if (!require(tidyr)) install.packages("tidyr")
## Loading required package: tidyr
library(tidyr)
if (!require(ggplot2)) install.packages("ggplot2")
## Loading required package: ggplot2
library(ggplot2)
if (!require(dplyr)) install.packages("dplyr")
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(dplyr)
if (!require(DataExplorer)) install.packages("DataExplorer")
## Loading required package: DataExplorer
library(DataExplorer)
cat("\nAll required libraries are ready to use.\n")
##
## All required libraries are ready to use.
# Read the raw car listing data from the working directory
data <- read.csv(file = "car_price.csv")
# --- Data Profiling ---
# Report the shape (row count, then column count) and the column headers
cat("---- Dataset Overview ----\n")
## ---- Dataset Overview ----
cat("Dataset dimensions: ", nrow(data), ncol(data), "\n")
## Dataset dimensions: 215 26
cat("Column names: ", colnames(data), "\n")
## Column names: car_ID symboling CarName fueltype aspiration doornumber carbody drivewheel enginelocation wheelbase carlength carwidth carheight curbweight enginetype cylindernumber enginesize fuelsystem boreratio stroke compressionratio horsepower peakrpm citympg highwaympg price
# Data types and structure
cat("\n---- Data Types and Structure ----\n")
##
## ---- Data Types and Structure ----
# str() writes the structure to the console itself and returns NULL invisibly,
# so it is called directly; wrapping it in print() appended a stray "NULL".
str(data) # Structure of the dataset
## 'data.frame': 215 obs. of 26 variables:
## $ car_ID : num 1 2 3 4 5 6 7 8 9 NA ...
## $ symboling : num 3 3 1 2 2 2 1 1 1 NA ...
## $ CarName : chr "alfa-romero giulia" "alfa-romero stelvio" "alfa-romero Quadrifoglio" "audi 100 ls" ...
## $ fueltype : chr "gas" "gas" "gas" "gas" ...
## $ aspiration : chr "std" "std" "std" "std" ...
## $ doornumber : chr "two" "two" "two" "four" ...
## $ carbody : chr "convertible" "convertible" "hatchback" "sedan" ...
## $ drivewheel : chr "rwd" "rwd" "rwd" "fwd" ...
## $ enginelocation : chr "front" "front" "front" "front" ...
## $ wheelbase : num 88.6 88.6 94.5 99.8 99.4 ...
## $ carlength : num 169 169 171 177 177 ...
## $ carwidth : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 NA ...
## $ carheight : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 NA ...
## $ curbweight : num 2548 2548 2823 2337 2824 ...
## $ enginetype : chr "dohc" "dohc" "ohcv" "ohc" ...
## $ cylindernumber : chr "four" "four" "six" "four" ...
## $ enginesize : num 130 130 152 109 136 136 136 136 131 NA ...
## $ fuelsystem : chr "mpfi" "mpfi" "mpfi" "mpfi" ...
## $ boreratio : num 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 NA ...
## $ stroke : num 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 NA ...
## $ compressionratio: num 9 9 9 10 8 8.5 8.5 8.5 8.3 NA ...
## $ horsepower : num 111 111 154 102 115 110 110 110 140 NaN ...
## $ peakrpm : num 5000 5000 5000 5500 5500 5500 5500 5500 5500 NA ...
## $ citympg : num 21 21 19 24 18 19 19 19 17 NA ...
## $ highwaympg : num 27 27 26 30 22 25 25 25 20 NA ...
## $ price : num 13495 16500 16500 13950 17450 ...
# Per-column summary: quantiles/mean/NA count for numeric columns,
# length/class/mode for character columns
cat("\n---- Summary Statistics ----\n")
##
## ---- Summary Statistics ----
print(x = summary(object = data))
## car_ID symboling CarName fueltype
## Min. : 1 Min. :-2.0000 Length:215 Length:215
## 1st Qu.: 50 1st Qu.: 0.0000 Class :character Class :character
## Median :103 Median : 1.0000 Mode :character Mode :character
## Mean :102 Mean : 0.8595
## 3rd Qu.:152 3rd Qu.: 2.0000
## Max. :205 Max. : 3.0000
## NA's :30 NA's :30
## aspiration doornumber carbody drivewheel
## Length:215 Length:215 Length:215 Length:215
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## enginelocation wheelbase carlength carwidth
## Length:215 Min. : 86.60 Min. :144.6 Min. :61.80
## Class :character 1st Qu.: 94.50 1st Qu.:166.3 1st Qu.:64.00
## Mode :character Median : 96.95 Median :173.2 Median :65.50
## Mean : 98.77 Mean :174.0 Mean :65.92
## 3rd Qu.:102.40 3rd Qu.:183.5 3rd Qu.:66.90
## Max. :115.60 Max. :202.6 Max. :72.30
## NA's :33 NA's :30 NA's :30
## carheight curbweight enginetype cylindernumber
## Min. :47.8 Min. :1713 Length:215 Length:215
## 1st Qu.:51.6 1st Qu.:2145 Class :character Class :character
## Median :54.1 Median :2420 Mode :character Mode :character
## Mean :53.7 Mean :2556
## 3rd Qu.:55.5 3rd Qu.:2952
## Max. :59.8 Max. :4066
## NA's :30 NA's :30
## enginesize fuelsystem boreratio stroke
## Min. :-500.0 Length:215 Min. :2.540 Min. :2.070
## 1st Qu.: 94.5 Class :character 1st Qu.:3.150 1st Qu.:3.110
## Median : 110.0 Mode :character Median :3.310 Median :3.270
## Mean : 99.9 Mean :3.325 Mean :3.255
## 3rd Qu.: 143.0 3rd Qu.:3.580 3rd Qu.:3.410
## Max. :2000.0 Max. :3.940 Max. :4.170
## NA's :28 NA's :32 NA's :30
## compressionratio horsepower peakrpm citympg
## Min. : 7.000 Min. :-50.00 Min. :4150 Min. :13.00
## 1st Qu.: 8.675 1st Qu.: 69.00 1st Qu.:4800 1st Qu.:19.00
## Median : 9.000 Median : 92.00 Median :5200 Median :24.00
## Mean : 67.985 Mean : 95.95 Mean :5142 Mean :25.15
## 3rd Qu.: 9.432 3rd Qu.:116.00 3rd Qu.:5500 3rd Qu.:30.00
## Max. :1000.000 Max. :288.00 Max. :6600 Max. :49.00
## NA's :27 NA's :27 NA's :30 NA's :30
## highwaympg price
## Min. :16.00 Min. :-1000
## 1st Qu.:25.00 1st Qu.: 7336
## Median :30.00 Median : 9984
## Mean :30.63 Mean :12451
## 3rd Qu.:34.00 3rd Qu.:16448
## Max. :54.00 Max. :45400
## NA's :30 NA's :27
# --- Convert Data Types ---
cat("\n---- Convert IDs and Categorical Columns to Appropriate Types ----\n")
##
## ---- Convert IDs and Categorical Columns to Appropriate Types ----
# car_ID is assumed to be an identifier and symboling a categorical rating;
# the remaining columns hold text-valued attributes. Each becomes a factor.
for (col in c(
  "car_ID", "symboling", "fueltype", "aspiration", "doornumber", "carbody",
  "drivewheel", "enginelocation", "enginetype", "cylindernumber", "fuelsystem"
)) {
  data[[col]] <- as.factor(data[[col]])
}
# --- Check for Missing Values ---
cat("\n---- Missing Values Check ----\n")
##
## ---- Missing Values Check ----
# Number of NA entries in every column
missing_values <- colSums(is.na(data))
cat("Missing values in each column:\n")
## Missing values in each column:
print(missing_values)
## car_ID symboling CarName fueltype
## 30 30 0 0
## aspiration doornumber carbody drivewheel
## 0 0 0 0
## enginelocation wheelbase carlength carwidth
## 0 33 30 30
## carheight curbweight enginetype cylindernumber
## 0 0 0 0
## enginesize fuelsystem boreratio stroke
## 28 0 32 30
## compressionratio horsepower peakrpm citympg
## 27 27 30 30
## highwaympg price
## 30 27
# --- Handling Missing Values ---
cat("\n---- Handling Missing Values ----\n")
##
## ---- Handling Missing Values ----
# Numeric columns: every NA is replaced by the median of the observed values
numeric_cols <- names(data)[vapply(data, is.numeric, logical(1))]
for (col in numeric_cols) {
  if (!anyNA(data[[col]])) {
    next
  }
  data[[col]] <- replace(
    data[[col]],
    is.na(data[[col]]),
    median(data[[col]], na.rm = TRUE)
  )
  cat(sprintf("Missing values in numeric column '%s' filled with median.\n", col))
}
## Missing values in numeric column 'wheelbase' filled with median.
## Missing values in numeric column 'carlength' filled with median.
## Missing values in numeric column 'carwidth' filled with median.
## Missing values in numeric column 'carheight' filled with median.
## Missing values in numeric column 'curbweight' filled with median.
## Missing values in numeric column 'enginesize' filled with median.
## Missing values in numeric column 'boreratio' filled with median.
## Missing values in numeric column 'stroke' filled with median.
## Missing values in numeric column 'compressionratio' filled with median.
## Missing values in numeric column 'horsepower' filled with median.
## Missing values in numeric column 'peakrpm' filled with median.
## Missing values in numeric column 'citympg' filled with median.
## Missing values in numeric column 'highwaympg' filled with median.
## Missing values in numeric column 'price' filled with median.
# Categorical columns: Mode imputation
# The categorical columns are factors by this point, so they are matched with
# is.factor() as well as is.character(); testing is.character() alone skipped
# them and left the NAs in 'symboling' in place.
# 'car_ID' is excluded on purpose: it is a row identifier, and copying the most
# frequent ID into other rows would fabricate duplicate identifiers.
id_cols <- "car_ID"
is_categorical <- vapply(
  data,
  function(x) is.character(x) || is.factor(x),
  logical(1)
)
categorical_cols <- setdiff(names(data)[is_categorical], id_cols)
for (col in categorical_cols) {
  if (anyNA(data[[col]])) {
    level_counts <- table(data[[col]], useNA = "no")
    # A column with no observed value has no mode; leave it for the final check
    if (sum(level_counts) == 0) {
      next
    }
    mode_value <- names(which.max(level_counts))
    data[[col]][is.na(data[[col]])] <- mode_value
    cat(sprintf("Missing values in categorical column '%s' filled with mode '%s'.\n", col, mode_value))
  }
}
# (one "filled with mode" line is printed above for 'symboling')
# Final Check: Verify all missing values have been handled
cat("\n---- Final Missing Values Check ----\n")
##
## ---- Final Missing Values Check ----
final_missing_values <- colSums(is.na(data))
cat("Remaining missing values in each column (should be 0):\n")
## Remaining missing values in each column (should be 0):
print(final_missing_values)
## car_ID symboling CarName fueltype
## 30 0 0 0
## aspiration doornumber carbody drivewheel
## 0 0 0 0
## enginelocation wheelbase carlength carwidth
## 0 0 0 0
## carheight curbweight enginetype cylindernumber
## 0 0 0 0
## enginesize fuelsystem boreratio stroke
## 0 0 0 0
## compressionratio horsepower peakrpm citympg
## 0 0 0 0
## highwaympg price
## 0 0
# Identifier columns are expected to keep their NAs; anything else that is
# still incomplete is reported instead of passing silently.
unresolved_cols <- setdiff(
  names(final_missing_values)[final_missing_values > 0],
  id_cols
)
if (length(unresolved_cols) > 0) {
  warning(
    "Missing values remain in: ", paste(unresolved_cols, collapse = ", "),
    call. = FALSE
  )
}
# --- Check for Duplicates ---
cat("\n---- Checking for Duplicates ----\n")
##
## ---- Checking for Duplicates ----
# Duplicate count = total rows minus the number of distinct rows
num_duplicates <- nrow(data) - nrow(dplyr::distinct(data))
cat(sprintf("Number of duplicate rows: %d\n", num_duplicates))
## Number of duplicate rows: 28
# --- Handle Duplicates ---
cat("\n---- Handling for Duplicates ----\n")
##
## ---- Handling for Duplicates ----
if (num_duplicates <= 0) {
  cat("\nNo duplicates found. Dataset remains unchanged.\n")
} else {
  cat("\nDuplicates found. Removing duplicate rows...\n")
  # Keep the first occurrence of every row
  data <- dplyr::distinct(data)
  cat(sprintf("Duplicates removed. Remaining rows: %d\n", nrow(data)))
}
##
## Duplicates found. Removing duplicate rows...
## Duplicates removed. Remaining rows: 187
# --- Final Check ---
cat("\n---- Final Duplicate Check ----\n")
##
## ---- Final Duplicate Check ----
# Recount to confirm that the removal left no repeated rows
final_num_duplicates <- nrow(data) - nrow(dplyr::distinct(data))
if (final_num_duplicates != 0) {
  cat(sprintf("Warning: %d duplicate rows remain after handling.\n", final_num_duplicates))
} else {
  cat("Success: No duplicate rows exist.\n")
}
## Success: No duplicate rows exist.
# --- Check Outliers in Numerical Columns ---
cat("\n---- Identifying Columns with Outliers ----\n")
##
## ---- Identifying Columns with Outliers ----
# Numeric columns to inspect
numeric_cols <- names(data)[vapply(data, is.numeric, logical(1))]
# Snapshot of the data before any capping, kept for the later comparison
data_original <- data
# One boxplot per numeric column of the untouched data; outliers drawn in red
for (col in numeric_cols) {
  plot <- ggplot(data_original, aes(x = "Original Data", y = .data[[col]])) +
    geom_boxplot(
      fill = "lightblue", color = "darkblue",
      outlier.colour = "red", outlier.shape = 16, outlier.size = 3
    ) +
    scale_y_continuous(labels = scales::comma) + # Thousands separators
    labs(
      title = paste("Boxplot of", col, "- Original Data"),
      x = "", y = col
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
      axis.title.y = element_text(size = 12, face = "bold"),
      axis.text.y = element_text(size = 10)
    )
  print(plot)
}














# --- Handling Outliers Using Capping ---
cat("\n---- Handling Outliers Using Capping ----\n")
##
## ---- Handling Outliers Using Capping ----
# Compute the IQR fences of a numeric vector.
#
# Args:
#   column: numeric vector; NA values are ignored.
#   k: multiplier applied to the interquartile range (default 1.5).
# Returns:
#   list(lower_bound = Q1 - k * IQR, upper_bound = Q3 + k * IQR). Both are
#   unnamed numeric scalars, so they no longer carry the quantile labels
#   "25%" / "75%" that quantile() attaches by default.
identify_outliers <- function(column, k = 1.5) {
  stopifnot(is.numeric(k), length(k) == 1)
  # Both quartiles in one call; names = FALSE drops the percent labels
  quartiles <- quantile(column, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Named `spread` rather than `IQR` to avoid shadowing stats::IQR()
  spread <- quartiles[2] - quartiles[1]
  list(
    lower_bound = quartiles[1] - k * spread,
    upper_bound = quartiles[2] + k * spread
  )
}
# Cap every numeric column at its IQR fences and report how many values moved
for (col in numeric_cols) {
  values <- data[[col]]
  bounds <- identify_outliers(values)
  lower_bound <- bounds$lower_bound
  upper_bound <- bounds$upper_bound
  # Values strictly outside the fences; NAs are never counted
  num_outliers <- sum(values < lower_bound | values > upper_bound, na.rm = TRUE)
  if (num_outliers == 0) {
    cat(sprintf("Column: %-15s | Outliers: None\n", col))
    next
  }
  # Share of the non-missing values that fall outside the fences
  outlier_percentage <- (num_outliers / sum(!is.na(values))) * 100
  # Clamp: raise values below the lower fence, lower values above the upper one
  data[[col]] <- pmin(pmax(values, lower_bound), upper_bound)
  cat(sprintf("Column: %-15s | Outliers: %-4d | Percentage: %-6.2f%% | Bounds: [%.2f, %.2f]\n",
              col, num_outliers, outlier_percentage, lower_bound, upper_bound))
}
## Column: wheelbase | Outliers: 6 | Percentage: 3.21 % | Bounds: [82.95, 113.75]
## Column: carlength | Outliers: None
## Column: carwidth | Outliers: 8 | Percentage: 4.28 % | Bounds: [60.00, 70.80]
## Column: carheight | Outliers: None
## Column: curbweight | Outliers: None
## Column: enginesize | Outliers: 16 | Percentage: 8.56 % | Bounds: [28.00, 212.00]
## Column: boreratio | Outliers: None
## Column: stroke | Outliers: 21 | Percentage: 11.23 % | Bounds: [2.67, 3.85]
## Column: compressionratio | Outliers: 36 | Percentage: 19.25 % | Bounds: [7.52, 10.53]
## Column: horsepower | Outliers: 15 | Percentage: 8.02 % | Bounds: [1.00, 185.00]
## Column: peakrpm | Outliers: 2 | Percentage: 1.07 % | Bounds: [3750.00, 6550.00]
## Column: citympg | Outliers: 1 | Percentage: 0.53 % | Bounds: [2.50, 46.50]
## Column: highwaympg | Outliers: 2 | Percentage: 1.07 % | Bounds: [11.50, 47.50]
## Column: price | Outliers: 13 | Percentage: 6.95 % | Bounds: [-6125.00, 30019.00]
# --- Compare Before and After Outlier Handling ---
cat("\n---- Comparing Before and After Outlier Handling ----\n")
##
## ---- Comparing Before and After Outlier Handling ----
# For each numeric column, draw the pre-capping snapshot (blue) next to the
# capped data (green) in a single figure
for (col in numeric_cols) {
  plot <- ggplot() +
    geom_boxplot(
      data = data_original,
      mapping = aes(x = "Original Data", y = .data[[col]]),
      fill = "lightblue", color = "darkblue",
      outlier.colour = "red", outlier.shape = 16, outlier.size = 3
    ) +
    geom_boxplot(
      data = data,
      mapping = aes(x = "After Capping", y = .data[[col]]),
      fill = "lightgreen", color = "darkblue",
      outlier.colour = "red", outlier.shape = 16, outlier.size = 3
    ) +
    scale_y_continuous(labels = scales::comma) + # Thousands separators
    labs(
      title = paste("Comparison of", col, "- Before and After Handling Outliers"),
      x = "", y = col
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
      axis.title.y = element_text(size = 12, face = "bold"),
      axis.text.y = element_text(size = 10)
    )
  print(plot)
}














# --- Why Use Capping for Predicting Car Prices? ---
cat("\n---- Why Use Capping for Predicting Car Prices? ----\n")
##
## ---- Why Use Capping for Predicting Car Prices? ----
# The three points are joined with a single space (paste's default separator),
# which is the same separator cat() puts between separate arguments
cat(paste(
  "1. Outliers in car prices may represent rare but valid cases (e.g., luxury cars).\n",
  "2. Removing outliers could result in the loss of important information critical for predicting prices.\n",
  "3. Capping limits the influence of extreme values without discarding data, ensuring the model learns from all cases.\n"
))
## 1. Outliers in car prices may represent rare but valid cases (e.g., luxury cars).
## 2. Removing outliers could result in the loss of important information critical for predicting prices.
## 3. Capping limits the influence of extreme values without discarding data, ensuring the model learns from all cases.
# --- Boxplots for Categorical Variables vs Price (Excluding Null/Empty Values) ---
cat("\n---- Boxplots for Categorical Variables Against Price (Excluding Null/Empty Values) ----\n")
##
## ---- Boxplots for Categorical Variables Against Price (Excluding Null/Empty Values) ----
library(ggplot2)
library(dplyr)
# Identify categorical columns.
# Most categorical columns are factors at this point, so both character and
# factor columns are matched; testing is.character() alone found only
# 'CarName' and the generic branch of the loop below never ran.
# 'car_ID' is a per-row identifier, not a grouping variable, so it is skipped.
is_categorical <- vapply(
  data,
  function(x) is.character(x) || is.factor(x),
  logical(1)
)
categorical_cols <- setdiff(names(data)[is_categorical], "car_ID")
cat("Categorical Columns Identified:\n")
## Categorical Columns Identified:
print(categorical_cols)
# Clean data: drop rows with an NA in any column or an empty string in any
# text/factor column. complete.cases() works on the typed columns directly,
# whereas apply() over a data frame first coerces every row to character.
has_blank <- Reduce(
  `|`,
  lapply(data[is_categorical], function(x) !is.na(x) & as.character(x) == ""),
  rep(FALSE, nrow(data))
)
data_cleaned <- data %>%
  filter(complete.cases(data), !has_blank)
# Ensure 'data_cleaned' is properly created
if (nrow(data_cleaned) == 0) {
  stop("Error: The dataset 'data_cleaned' is empty after removing null or empty values.")
}
# Loop through categorical columns and create one price boxplot per column
for (col in categorical_cols) {
  if (col == "CarName") {
    # 'CarName' has too many levels to plot; keep the 10 most frequent names
    cat(sprintf("\nColumn '%s' has many categories. Selecting top 10 by frequency.\n", col))
    top_cars <- data_cleaned %>%
      count(CarName) %>%
      arrange(desc(n)) %>%
      slice(1:10) # Select top 10 by frequency
    # Filter cleaned data to include only those categories
    filtered_data <- data_cleaned %>% filter(CarName %in% top_cars$CarName)
    # Boxes are ordered along the x axis by median price
    plot <- ggplot(filtered_data, aes(x = reorder(CarName, price, FUN = median), y = price, fill = CarName)) +
      geom_boxplot(outlier.colour = "red", outlier.shape = 16, outlier.size = 2) +
      labs(
        title = "Price Distribution by Top 10 Most Frequent Car Names",
        x = "Car Name",
        y = "Price"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
        axis.title.x = element_text(size = 12, face = "bold"),
        axis.title.y = element_text(size = 12, face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1)
      ) +
      scale_y_continuous(labels = scales::comma)
    # Display the plot
    print(plot)
  } else {
    # Standard boxplot for the other categorical variables
    filtered_data <- data_cleaned %>%
      filter(!is.na(.data[[col]]) & .data[[col]] != "")
    # .data[[col]] replaces the deprecated aes_string(); the fill label is set
    # explicitly so the legend title stays the column name
    plot <- ggplot(filtered_data, aes(x = .data[[col]], y = price, fill = .data[[col]])) +
      geom_boxplot(outlier.colour = "red", outlier.shape = 16, outlier.size = 2) +
      labs(
        title = paste("Price Distribution by", col),
        x = col,
        y = "Price",
        fill = col
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
        axis.title.x = element_text(size = 12, face = "bold"),
        axis.title.y = element_text(size = 12, face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1)
      ) +
      scale_y_continuous(labels = scales::comma)
    # Display the plot
    print(plot)
  }
}
##
## Column 'CarName' has many categories. Selecting top 10 by frequency.

# --- Why Visualize Top 10 Most Frequent Car Names? ---
# Prints a section banner followed by a fixed explanatory paragraph.
cat("\n---- Why Visualize Top 10 Most Frequent Car Names? ----\n")
##
## ---- Why Visualize Top 10 Most Frequent Car Names? ----
# The paragraph is one multi-line string literal; its line breaks are part of
# the printed text.
cat(
"Visualizing the top 10 most frequent car names provides a clear and focused analysis of pricing trends for popular models.
Since 'CarName' has a large number of unique categories, including all of them in a single visualization would lead to clutter
and make it difficult to interpret the data. By limiting the analysis to the most frequent categories, we can ensure that the
visualization remains meaningful and actionable. This approach highlights patterns and trends in pricing for cars that are
most relevant to our analysis while avoiding the noise introduced by less frequent categories.\n\n"
)
## Visualizing the top 10 most frequent car names provides a clear and focused analysis of pricing trends for popular models.
## Since 'CarName' has a large number of unique categories, including all of them in a single visualization would lead to clutter
## and make it difficult to interpret the data. By limiting the analysis to the most frequent categories, we can ensure that the
## visualization remains meaningful and actionable. This approach highlights patterns and trends in pricing for cars that are
## most relevant to our analysis while avoiding the noise introduced by less frequent categories.
# --- How We Handled Missing Values ---
# Prints a section banner followed by a fixed description of the row filtering
# used for the categorical plots.
# NOTE(review): the filter that builds 'data_cleaned' drops rows with an NA in
# ANY column, not only in 'price' or the categorical columns as the text below
# says - confirm the wording is still what is intended.
cat("\n---- How We Handled Missing Values ----\n")
##
## ---- How We Handled Missing Values ----
# Each "\n" escape is followed by a literal line break, which is why the
# numbered items are separated by blank lines in the output.
cat(
"To ensure accurate and clean visualizations, we also handled missing (NA) and null values in the dataset. Rows with missing values
in the 'price' column or any categorical variable were excluded from the analysis. This was achieved by applying the following steps:\n
1. Filtering out rows where 'price' was NA.\n
2. Removing rows where any categorical column had an NA or empty string value.\n
By performing this cleaning step, we ensured that the data used for visualization is complete and reliable, making the analysis
robust and free from biases introduced by incomplete data."
)
## To ensure accurate and clean visualizations, we also handled missing (NA) and null values in the dataset. Rows with missing values
## in the 'price' column or any categorical variable were excluded from the analysis. This was achieved by applying the following steps:
##
## 1. Filtering out rows where 'price' was NA.
##
## 2. Removing rows where any categorical column had an NA or empty string value.
##
## By performing this cleaning step, we ensured that the data used for visualization is complete and reliable, making the analysis
## robust and free from biases introduced by incomplete data.
# --- Scatterplots for Numerical Variables vs Price ---
cat("\n---- Scatterplots for Numerical Variables Against Price ----\n")
##
## ---- Scatterplots for Numerical Variables Against Price ----
library(ggplot2)
# Clean data by keeping only rows with no NA in any column (price included).
# complete.cases() tests the typed columns directly; apply() over a data frame
# would first coerce every row to a character vector.
data_cleaned <- data %>%
  filter(complete.cases(.))
# Identify numerical columns
numeric_cols <- names(data_cleaned)[vapply(data_cleaned, is.numeric, logical(1))]
cat("Numerical Columns Identified:\n")
## Numerical Columns Identified:
print(numeric_cols)
## [1] "wheelbase" "carlength" "carwidth" "carheight"
## [5] "curbweight" "enginesize" "boreratio" "stroke"
## [9] "compressionratio" "horsepower" "peakrpm" "citympg"
## [13] "highwaympg" "price"
# One scatterplot per numeric predictor; 'price' itself is the y axis and is
# therefore left out of the loop
for (col in setdiff(numeric_cols, "price")) {
  cat(sprintf("\nCreating scatterplot for '%s' vs 'price'.\n", col))
  # .data[[col]] replaces the deprecated aes_string(); points are coloured by
  # 'CarName' and a linear trend line is overlaid in red
  plot <- ggplot(data_cleaned, aes(x = .data[[col]], y = price)) +
    geom_point(aes(color = CarName), alpha = 0.5) +
    geom_smooth(method = "lm", color = "red", se = FALSE) +
    labs(
      title = paste("Relationship Between", col, "and Price"),
      x = col,
      y = "Price"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
      axis.title.x = element_text(size = 12, face = "bold"),
      axis.title.y = element_text(size = 12, face = "bold"),
      axis.text.x = element_text(size = 10),
      axis.text.y = element_text(size = 10),
      legend.position = "none" # Optional: remove legend if CarName has too many values
    ) +
    scale_y_continuous(labels = scales::comma) + # Format y-axis with commas
    scale_x_continuous(labels = scales::comma) # Format x-axis with commas
  # Display the plot
  print(plot)
}
##
## Creating scatterplot for 'wheelbase' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'carlength' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'carwidth' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'carheight' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'curbweight' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'enginesize' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'boreratio' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'stroke' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'compressionratio' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'horsepower' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'peakrpm' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'citympg' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'highwaympg' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

# --- Why Visualize Numerical Variables Against Price? ---
# Prints a section banner followed by a fixed explanatory paragraph.
cat("\n---- Why Visualize Numerical Variables Against Price? ----\n")
##
## ---- Why Visualize Numerical Variables Against Price? ----
# The paragraph is one multi-line string literal with no trailing newline, so
# whatever is printed next continues on the same output line.
cat(
"Scatterplots for numerical variables against 'price' help identify patterns, trends, and potential relationships in the data.
By using points colored by 'CarName', we can observe differences between car models while highlighting correlations with linear trendlines.
This analysis is crucial for understanding which numerical features have significant impacts on price."
)
## Scatterplots for numerical variables against 'price' help identify patterns, trends, and potential relationships in the data.
## By using points colored by 'CarName', we can observe differences between car models while highlighting correlations with linear trendlines.
## This analysis is crucial for understanding which numerical features have significant impacts on price.
# --- Feature Engineering for Predicting Car Prices ---
cat("\n---- Feature Engineering for Predicting Car Prices ----\n")
##
## ---- Feature Engineering for Predicting Car Prices ----
library(dplyr)
# 1. **Brand Value Feature**
cat("\nCreating Brand Value Feature...\n")
##
## Creating Brand Value Feature...
# The brand is everything before the first space of CarName, lower-cased
data$CarBrand <- tolower(gsub(pattern = " .*", replacement = "", x = data$CarName))
# Map known misspellings and abbreviations onto the canonical brand name
brand_corrections <- c(
  "maxda" = "mazda",
  "vw" = "volkswagen",
  "vokswagen" = "volkswagen",
  "porcshce" = "porsche",
  "toyouta" = "toyota"
)
data$CarBrand <- recode(data$CarBrand, !!!brand_corrections)
# Mean price per brand, then attached to every row of that brand.
# NOTE(review): this feature is derived from the target 'price' itself -
# confirm it is recomputed on training data only before modelling.
brand_avg_price <- summarise(
  group_by(data, CarBrand),
  BrandAvgPrice = mean(price, na.rm = TRUE),
  .groups = "drop"
)
data <- left_join(data, brand_avg_price, by = "CarBrand")
cat("Brand value feature created.\n")
## Brand value feature created.
# 2. **Engine Efficiency**
cat("\nCalculating Engine Efficiency...\n")
##
## Calculating Engine Efficiency...
# Horsepower delivered per unit of engine size
data$engine_efficiency <- data$horsepower / data$enginesize
cat("Engine efficiency feature created.\n")
## Engine efficiency feature created.
# 3. **Luxury Indicator**
cat("\nCreating a Luxury Indicator Feature...\n")
##
## Creating a Luxury Indicator Feature...
# Cars priced above the 85th percentile are flagged "Yes", all others "No"
luxury_threshold <- quantile(data$price, probs = 0.85, na.rm = TRUE)
data$is_luxury <- factor(ifelse(data$price > luxury_threshold, "Yes", "No"))
cat(sprintf("Luxury threshold for price: %.2f\n", luxury_threshold))
## Luxury threshold for price: 18401.10
cat("Luxury indicator feature created.\n")
## Luxury indicator feature created.
# 4. **Fuel Economy Score**
cat("\nCalculating Fuel Economy Score...\n")
##
## Calculating Fuel Economy Score...
# Plain average of the city and highway mileage
data$fuel_economy_score <- with(data, (citympg + highwaympg) / 2)
cat("Fuel economy score feature created.\n")
## Fuel economy score feature created.
# 5. **Vehicle Age**
cat("\nCalculating Age of Vehicle...\n")
##
## Calculating Age of Vehicle...
# Age is measured against the fixed reference year 2024; a missing model year
# is treated as 2020. Without a 'year' column every row gets 2024 - 2020.
if (!("year" %in% names(data))) {
  cat("Column 'year' not found. Defaulting to a hypothetical age (2024 - 2020).\n")
  data$car_age <- rep(2024 - 2020, nrow(data)) # Same fixed value on every row
} else {
  data$year <- as.numeric(data$year)
  data$car_age <- 2024 - ifelse(is.na(data$year), 2020, data$year) # Replace 2024 with the current year
}
## Column 'year' not found. Defaulting to a hypothetical age (2024 - 2020).
cat("Vehicle age feature created.\n")
## Vehicle age feature created.
# --- Saving the Cleaned and Feature-Enhanced Dataset ---
cat("\n---- Saving the Feature-Enhanced Dataset ----\n")
##
## ---- Saving the Feature-Enhanced Dataset ----
# Write the data with the engineered columns to the working directory,
# without a row-name column
write.csv(x = data, file = "car_price_feature_engineered.csv", row.names = FALSE)
cat("Dataset saved as 'car_price_feature_engineered.csv'.\n")
## Dataset saved as 'car_price_feature_engineered.csv'.
# --- Heatmap of Correlation with Price ---
cat("\n---- Heatmap of Correlation with Price ----\n")
##
## ---- Heatmap of Correlation with Price ----
# Select numerical columns, including price
numeric_cols <- names(data)[vapply(data, is.numeric, logical(1))]
# Ensure 'price' is included in the numeric columns
if (!"price" %in% numeric_cols) {
  stop("Error: 'price' column must be numeric to calculate correlation.")
}
# Restrict to rows that are complete across all numeric columns (the rows
# cor(use = "complete.obs") would use), then drop columns that are constant
# there. A constant column such as a fixed 'car_age' has zero standard
# deviation, which made cor() warn and return NA for it.
complete_numeric <- data[complete.cases(data[numeric_cols]), numeric_cols, drop = FALSE]
has_variance <- vapply(
  complete_numeric,
  function(x) isTRUE(stats::sd(x) > 0),
  logical(1)
)
if (!has_variance[["price"]]) {
  stop("Error: 'price' has no variation across complete rows; correlation is undefined.")
}
constant_cols <- numeric_cols[!has_variance]
if (length(constant_cols) > 0) {
  cat(sprintf(
    "Skipping constant column(s) with undefined correlation: %s\n",
    paste(constant_cols, collapse = ", ")
  ))
}
# Compute correlation matrix on the remaining columns
correlation_matrix <- cor(complete_numeric[numeric_cols[has_variance]])
# Extract correlation of variables with 'price'
price_correlation <- correlation_matrix["price", , drop = FALSE] # Keep it a 1-row matrix
# Convert to a long data frame (Var1 = "price", Var2 = variable, Freq = r)
correlation_df <- as.data.frame(as.table(price_correlation))
# Visualize correlation with a heatmap
library(ggplot2)
plot <- ggplot(correlation_df, aes(x = Var2, y = Var1, fill = Freq)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0,
                       name = "Correlation") +
  labs(
    title = "Heatmap of Correlation with Price",
    x = "Variable",
    y = ""
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    axis.title.x = element_text(size = 14, face = "bold")
  )
# Printed explicitly, like every other plot in this script, so the figure is
# also drawn when the file is run through source()
print(plot)

# --- Save Cleaned Data ---
cat("\n---- Saving Cleaned Data ----\n")
##
## ---- Saving Cleaned Data ----
# Write the current state of 'data' to a second CSV, without a row-name column
write.csv(x = data, file = "car_price_cleaned.csv", row.names = FALSE)
cat("Cleaned dataset saved as 'car_price_cleaned.csv'.\n")
## Cleaned dataset saved as 'car_price_cleaned.csv'.